In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
from sklearn import linear_model
from math import sqrt
plt.style.use('ggplot')
%matplotlib inline
In [2]:
# Read the home sales CSV (path is relative to the notebook) into a DataFrame.
sales = pd.read_csv(filepath_or_buffer="../data/home_data.csv")
In [3]:
# Preview the first ten rows to sanity-check the import.
sales.head(n=10)
Out[3]:
In [4]:
# Show the dtype pandas inferred for each column.
sales.dtypes
Out[4]:
The import into pandas looks correct, but the `date` column has not been parsed as a datetime and `zipcode` is interpreted as a number rather than a category. Let's fix both.
In [5]:
# Re-type the two columns: parse `date` as a datetime and treat `zipcode`
# as a categorical label instead of a number.
sales = sales.assign(
    date=pd.to_datetime(sales['date']),
    zipcode=sales['zipcode'].astype('category'),
)
In [6]:
# Look at the first ten rows again to inspect the frame.
sales.head(n=10)
Out[6]:
In [7]:
# Number of rows (houses) in the data set.
sales.shape[0]
Out[7]:
In [8]:
# Scatter of sale price against living area.
sales.plot.scatter(x='sqft_living', y='price')
Out[8]:
In [9]:
# Split with scikit-learn's train_test_split (imported at the top).
# It takes the size of the *test* split, whereas the lecture gave the size of
# the train split; random_state is the seed for the shuffle.
train_data, test_data = train_test_split(
    sales,
    random_state=42,
    test_size=0.2,
)
In [10]:
# Row counts of the train and test splits.
print(train_data.shape[0], test_data.shape[0])
In [11]:
# Fit a one-feature linear regression: price as a function of living area.
sqft_model = linear_model.LinearRegression()
# scikit-learn needs a 2-D (n_samples, n_features) feature matrix. Reshape the
# underlying NumPy array instead of calling np.reshape on the Series: that call
# relies on Series.reshape, which pandas deprecated and removed, so depending
# on the installed version it warns or fails ("Data must be 1-dimensional").
train_X = train_data['sqft_living'].values.reshape(-1, 1)
train_Y = train_data['price']
sqft_model.fit(X = train_X, y = train_Y)
Out[11]:
In [12]:
# Mean sale price in the test split, for comparison with the error figures.
print(test_data.price.mean())
In [13]:
# Define a helper function to assess model performance
def evaluate(model, test_x, test_y):
    """Score a fitted regression model on held-out data.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(X)`` that returns one prediction per row.
    test_x : array-like
        Features, passed unchanged to ``model.predict``.
    test_y : array-like
        Observed target values, one per row of ``test_x``.

    Returns
    -------
    dict
        ``'rmse'`` is the root-mean-squared error and ``'max_error'`` is the
        largest absolute error over the test rows.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of targets.
    """
    # Flatten both sides to 1-D float arrays so the subtraction is strictly
    # element-wise; otherwise an (n, 1) prediction minus an (n,) target would
    # silently broadcast to an (n, n) matrix and inflate both metrics.
    predicted = np.asarray(model.predict(test_x), dtype=float).ravel()
    observed = np.asarray(test_y, dtype=float).ravel()
    if predicted.shape != observed.shape:
        raise ValueError(
            'model produced %d predictions for %d targets'
            % (predicted.size, observed.size)
        )
    errors = np.absolute(predicted - observed)
    return {'rmse': sqrt(np.mean(errors ** 2)), 'max_error': errors.max()}
In [14]:
# Build the 2-D (n_samples, 1) test feature matrix from the underlying NumPy
# array; np.reshape on the Series itself relies on Series.reshape, which pandas
# deprecated and removed, so it warns or fails depending on the version.
test_X = test_data['sqft_living'].values.reshape(-1, 1)
test_Y = test_data['price']
# Score the single-feature model on the held-out rows.
evaluate(sqft_model, test_X, test_Y)
Out[14]:
In [15]:
# Overlay the model's predictions (line) on the observed test prices (dots).
plt.plot(
    test_data['sqft_living'], test_data['price'], '.',
    test_data['sqft_living'], sqft_model.predict(test_X), '-',
)
Out[15]:
In [16]:
# Fitted slope (price change per unit of sqft_living) and intercept.
slope, intercept = sqft_model.coef_[0], sqft_model.intercept_
print(slope, intercept)
In [17]:
# List the available column names before choosing features.
sales.columns
Out[17]:
In [18]:
# Columns used as inputs to the multi-feature model.
my_features = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'zipcode',
]
In [19]:
# Summary statistics for the selected features; include='all' requests every
# column regardless of dtype.
sales.loc[:, my_features].describe(include='all')
Out[19]:
In [20]:
# Histograms of the selected features.
sales.loc[:, my_features].hist()
Out[20]:
In [20]:
# Price distribution per zipcode; tick labels rotated by -45 degrees.
sales.boxplot(by='zipcode', column='price', rot=-45)
Out[20]:
In [21]:
# Expand the categorical zipcode into indicator (dummy) columns, applying the
# same encoding to the train and test splits.
train_with_dummies, test_with_dummies = (
    pd.get_dummies(split[my_features]) for split in (train_data, test_data)
)
train_with_dummies.head()
Out[21]:
In [22]:
# Fit a linear regression on the dummy-encoded feature set against price.
my_features_model = linear_model.LinearRegression()
my_features_model.fit(train_with_dummies, train_Y)
Out[22]:
In [23]:
# Print the test-set metrics of both models, single-feature model first.
for fitted, features in (
    (sqft_model, test_X),
    (my_features_model, test_with_dummies),
):
    print(evaluate(fitted, features, test_Y))
In [24]:
# Select the row(s) whose id is 5309101200.
house1 = sales.loc[sales['id'].eq(5309101200)]
In [25]:
# Display the selected row(s) for house1.
house1
Out[25]:
In [26]:
# Observed sale price of house1, for comparison with the predictions.
print(house1['price'])
In [27]:
# Prediction from the sqft_model. The feature must be a 2-D (n_samples, 1)
# array; reshape the underlying NumPy array, because np.reshape on the Series
# relies on Series.reshape, which pandas deprecated and removed.
sqft_model.predict(house1['sqft_living'].values.reshape(-1, 1))
Out[27]:
In [28]:
# Prediction from my_features_model, with zipcode expanded into dummy columns
# the same way as for the training data.
my_features_model.predict(X=pd.get_dummies(house1.loc[:, my_features]))
Out[28]:
In [29]:
# Select the row(s) whose id is 1925069082.
house2 = sales.loc[sales['id'].eq(1925069082)]
In [30]:
# Display the selected row(s) for house2.
house2
Out[30]:
In [31]:
# Single-feature prediction for house2. Reshape the underlying NumPy array to
# (n_samples, 1); np.reshape on the Series relies on Series.reshape, which
# pandas deprecated and removed.
sqft_model.predict(house2['sqft_living'].values.reshape(-1, 1))
Out[31]:
In [32]:
# Multi-feature prediction for house2, with zipcode expanded into dummy columns.
my_features_model.predict(X=pd.get_dummies(house2.loc[:, my_features]))
Out[32]:
In [33]:
# Start the hypothetical house from an independent copy of house2 so the
# edits made to it do not touch the original rows.
bill_gates = house2.copy(deep=True)
In [34]:
# Overwrite the numeric features with the hypothetical house's values.
for column, value in (
    ('bedrooms', 8),
    ('bathrooms', 25),
    ('sqft_living', 50000),
    ('sqft_lot', 225000),
    ('floors', 4),
):
    bill_gates[column] = value
# zipcode is assigned through .loc, unlike the plain column assignments above.
bill_gates.loc[:, 'zipcode'] = 98039
In [35]:
# Display the zipcode column to inspect its value and dtype after the assignment.
bill_gates['zipcode']
Out[35]:
In [36]:
# Single-feature prediction for the hypothetical house. Reshape the underlying
# NumPy array to (n_samples, 1); np.reshape on the Series relies on
# Series.reshape, which pandas deprecated and removed.
sqft_model.predict(bill_gates['sqft_living'].values.reshape(-1, 1))
Out[36]:
In [37]:
# Multi-feature prediction for the hypothetical house, with zipcode expanded
# into dummy columns.
my_features_model.predict(X=pd.get_dummies(bill_gates.loc[:, my_features]))
Out[37]: